###########################################################################
# Lipps & Sczepanski & Malet
# Understanding Preferences over Borders
# International Studies Quarterly

# Main: Analysis of open-question -----------------------------------------
###########################################################################

# NOTE: the workspace is intentionally NOT cleared with rm(list = ls()).
# Clearing it would delete every object of whoever sources this script from
# an interactive session; nothing below depends on an empty workspace.
# Run the script in a fresh R session for a clean replication.

# Packages needed by this script.
packages <- c("tidyverse", "stringr", "tidytext", "ggplot2", "xtable")

# Install each package that cannot be found in the current library paths.
for (package in packages) {
  if (!requireNamespace(package, quietly = TRUE)) {
    install.packages(package, dependencies = TRUE)
  }
}

library(tidyverse)
library(tidytext)
library(ggplot2)
library(stringr)
library(xtable)

# Read the survey data; empty strings and the literal "NA" both become NA.
dat <- read.csv(
  file = "survey_dat_clean.csv",
  na.strings = c("", "NA")
)

# Names of the four open-answer association columns
# (border_asso_1_trans ... border_asso_4_trans).
variables <- paste0("border_asso_", seq_len(4), "_trans")

# Strip all punctuation characters from one column of a data frame, so that
# placeholder answers such as "..." collapse to the empty string "".
#
# Arguments:
#   data  a data.frame (or tibble)
#   var   name of the column to clean, as a length-one character string
# Returns:
#   `data` with column `var` replaced by its punctuation-free version.
#   NA entries stay NA; the result of gsub() is character, so a
#   non-character column comes back as character.
#
# The replacement is done on the whole column at once via `[[`, which also
# works for zero-row inputs and for tibbles (where `data[, var]` would
# return a one-column tibble instead of a vector).
replace_func <- function(data, var) {
  data[[var]] <- gsub("[[:punct:]]", "", data[[var]])
  data
}

# Run replace_func() over every column named in `variables`, feeding the
# result of one call into the next.
dat <- Reduce(replace_func, variables, dat)

# Keep the respondent id, the "_trans" word columns, the
# "border_asso_ratings" columns and the border_open_close_1 item.
dat_small <- dplyr::select(
  dat,
  c("id", contains(c("_trans", "border_asso_ratings"))),
  border_open_close_1
)

# Long format of the words: one row per respondent and word column.
dat_small_trans <- pivot_longer(
  dplyr::select(dat_small, c("id", contains("_trans")), border_open_close_1),
  cols = contains("_trans"),
  names_to = "word_num",
  values_to = "word"
)

# Long format of the ratings: one row per respondent and rating column.
dat_small_rat <- pivot_longer(
  dplyr::select(dat_small, id, contains("_ratings")),
  cols = !id,
  names_to = "word_rat",
  values_to = "rating"
)

# Put words and ratings side by side. The pairing is purely positional.
# NOTE(review): this assumes the rating columns appear in the same order as
# the word columns and that there are equally many of each; confirm against
# the column names of the survey file.
dat_word_all <- cbind(
  dat_small_trans,
  dat_small_rat[, c("word_rat", "rating")]
)

# The two helper name columns are no longer needed.
dat_word_all <- tibble(dplyr::select(dat_word_all, -c(word_num, word_rat)))

# Cleaning
# Normalise whitespace FIRST, so that the exact-match recodes and the
# junk-answer filter below also catch entries with stray leading/trailing
# blanks. str_squish() trims both ends and collapses every internal run of
# whitespace (including \r, \n and \t) into a single space, so no separate
# removal of line breaks or tabs is needed afterwards.
dat_word_all$word <- str_squish(dat_word_all$word)

# Correct the translation of "zaun" (-> "fence") and "pass" (-> "passport").
# %in% is never NA, so missing words are left untouched.
dat_word_all$word[dat_word_all$word %in% "zaun"] <- "fence"
dat_word_all$word[dat_word_all$word %in% "pass"] <- "passport"

# Answers treated as non-responses (also reused further down the script).
nas <- c("nan", "ka", "nix", "x", "know", "knowing", "knowledge", "nn", "k", "non", "so", "", "xx", "kp", " ", "fasfd")

# Drop non-responses and missing words.
dat_word_all <- dat_word_all %>%
  filter(!is.na(word), !word %in% nas)

# Classify each row by border_open_close_1 into three groups:
#   below 4 -> "open", 4 to 6 -> "restricted", above 6 -> "closed".
# A missing answer gives NA. (Author's note: the median is 5.)
dat_word_all <- mutate(
  dat_word_all,
  open_cat = case_when(
    border_open_close_1 < 4 ~ "open",
    border_open_close_1 <= 6 ~ "restricted",
    border_open_close_1 > 6 ~ "closed"
  )
)

# Per-word summary over the pooled sample: mean rating and number of
# mentions; words mentioned only once are dropped.
summary_dat <- dat_word_all %>%
  group_by(word) %>%
  summarise(
    mean_rat = mean(as.numeric(rating), na.rm = TRUE),
    freq = n()
  ) %>%
  ungroup() %>%
  filter(freq > 1)

# Most frequent words: every word whose frequency is among the 20 highest
# distinct frequency values, sorted from most to least frequent.
# NOTE(review): if several words share a frequency this keeps more than 20
# words; confirm that this is intended.
slctwords <- summary_dat[
  summary_dat$freq %in% tail(sort(unique(summary_dat$freq)), 20),
]
slctwords <- arrange(slctwords, desc(freq))

# Summary by word and openness group: mean rating, number of mentions and
# frequency relative to the group size. Word/group pairs mentioned only once
# are dropped.
# NOTE(review): the group sizes 1431 / 1771 / 1535 are hard-coded; they
# presumably are the numbers of respondents per group and must be updated
# by hand if the data change.
summary_open <- dat_word_all %>%
  filter(!is.na(word), !word %in% nas) %>%
  group_by(word, open_cat) %>%
  summarise(
    mean_rat = mean(as.numeric(rating), na.rm = TRUE),
    freq = n()
  ) %>%
  mutate(
    # look the group size up by name; a missing open_cat gives NA
    grpsize = unname(c(open = 1431, restricted = 1771, closed = 1535)[open_cat]),
    relfreq = freq / grpsize
  ) %>%
  ungroup() %>%
  filter(freq > 1)

# Restrict the by-group summary to the most frequent words selected above.
slctwords_open <- summary_open[summary_open$word %in% slctwords$word, ]

#-----------------------------------------------------------------------------------------------

### FIGURE 1 ###
### lollipop plot of 20 most frequent words and their rating by group

# Build the plot object before opening the graphics device, so a failure
# while constructing the plot cannot leave a PDF device open.
fig_1 <- slctwords_open %>%
  filter(!is.na(open_cat)) %>%
  arrange(relfreq) %>%
  ggplot(aes(y = reorder(word, mean_rat), label = word, x = mean_rat)) +
  # one point per word and group, dodged so the three groups do not overlap
  geom_point(aes(shape = open_cat, colour = mean_rat), size = 4,
             position = position_dodge(width = .5)) +
  # the "stick" of the lollipop, drawn from 0 to the mean rating
  geom_linerange(aes(y = word, xmin = 0, xmax = mean_rat, group = open_cat,
                     colour = mean_rat),
                 position = position_dodge(width = .5)) +
  labs(x = "Average Rating", y = NULL,
       subtitle = "Top 20 most frequent associations with 'border' (pooled sample)",
       shape = "") +
  theme_minimal() +
  scale_color_viridis_c(limits = c(-5, 5), guide = "none", option = "C") +
  scale_x_continuous(limits = c(-5, 5)) +
  scale_shape_manual(breaks = c("open", "restricted", "closed"),
                     values = c(16, 15, 17)) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  theme(
    plot.subtitle = element_text(hjust = .5, size = 16),
    axis.text.y.left = element_text(size = 12),
    axis.text.x = element_text(angle = -45, size = 16, hjust = 0.1, vjust = 0.5),
    legend.text = element_text(size = 12),
    axis.title.y = element_text(size = 14)) +
  coord_flip()

# Write the figure. print() is called explicitly because a ggplot object is
# only drawn when printed, and top-level auto-printing does not happen when
# the script is run through source(); without it the PDF would be empty.
pdf(file = "fig_1.pdf", width = 8, height = 5, onefile = FALSE)
print(fig_1)
dev.off()

#--------------------------------------------------------------------------------

# Appendix Table A.1
### Summary table of the most frequent words by group (open / restricted /
### closed), reporting each word's within-group rank.

# Words whose frequency is among the 20 highest frequencies of at least one
# group. Rows without a group (missing open_cat) are excluded, as in
# Figure 1; otherwise they would form a group of their own, add their words
# to the list and produce a spurious "NA" column in the table.
listtops <- summary_open %>%
  filter(!is.na(open_cat)) %>%
  group_by(open_cat) %>%
  filter(freq %in% tail(sort(freq), 20)) %>%
  ungroup() %>%
  distinct(word)

# Rank the selected words within each group by relative frequency (in
# percent, rounded to one decimal; rank 1 = most frequent), then add the
# word/group combinations that do not occur so every word has a row per group.
topsbycat <- summary_open %>%
  filter(!is.na(open_cat), word %in% listtops$word) %>%
  mutate(open_cat = factor(open_cat, levels = c("open", "restricted", "closed")),
         relfreq = round(relfreq * 100, 1)) %>%
  group_by(open_cat) %>%
  mutate(rank = rank(-relfreq, ties.method = "last")) %>%
  ungroup() %>%
  complete(word, open_cat)

# One row per word, one column of ranks per group.
apptab <- topsbycat %>%
  pivot_wider(id_cols = word, id_expand = TRUE,
              names_from = "open_cat", values_from = "rank")

# Print explicitly so the LaTeX table is also emitted when the script is
# run through source(), where top-level values are not auto-printed.
print(xtable(apptab))
